import pandas as pd
from itertools import product
import numpy as np

import warnings
# Silence every warning category for the rest of the run.
# NOTE(review): this also hides any pandas/NumPy warnings raised by the
# computations further down (e.g. from divisions or logarithms) -- confirm
# that suppressing them is intended.
warnings.filterwarnings('ignore')

# To take `c` from the first command-line argument instead of hard-coding it
# below, uncomment:
# import sys
# args = sys.argv
# c = int(args[1])
# Value substituted into the name of every input and output file of this script.
c = 1000

df = pd.read_csv('../data/gpo_final_data/narratives_complete_with_metadata_manual_labels_{0}.csv'.format(c))

# Add sentiment of raw sentence. A left join keeps every row of `df`; rows
# whose `sentence_raw` has no entry in the sentiment file get NaN in the
# added columns.
sentiments = pd.read_csv('../data/metadata/sentence_sentiments_nltk_vader_{0}.csv'.format(c))
df = df.merge(sentiments, on='sentence_raw', how='left')

# Calculate, for every narrative, the odds ratio of it being used by
# Republicans rather than Democrats, from the 2x2 table
#
#                      Republican              Democrat
#   this narrative     rep                     dem
#   other narratives   rep_narratives - rep    dem_narratives - dem
#
# Rows whose party is neither 'Republican' nor 'Democrat' are left out.
partisan = df.loc[df['party'].isin(['Republican', 'Democrat']), ['narrative', 'party']]
is_republican = partisan['party'] == 'Republican'

# One row per narrative holding the number of Republican / Democrat rows.
odds_ratio = (
    pd.DataFrame({
        'narrative': partisan['narrative'],
        'rep': is_republican.astype(int),
        'dem': (~is_republican).astype(int),
    })
    .groupby('narrative', sort=False)
    .sum()
    .reset_index()
)

# Party totals must be summed over the one-row-per-narrative table. Summing
# per-narrative counts that were broadcast back onto every input row would
# weight each narrative by its own frequency and inflate both totals.
rep_narratives = odds_ratio['rep'].sum()
dem_narratives = odds_ratio['dem'].sum()
other_rep = rep_narratives - odds_ratio['rep']
other_dem = dem_narratives - odds_ratio['dem']

odds_ratio['or'] = (odds_ratio['rep'] / odds_ratio['dem']) / (other_rep / other_dem)
odds_ratio['log_or'] = np.log(odds_ratio['or'])

# Calculate the 95% confidence interval for the log odds ratio: log(OR) +/- 1.96
# standard errors, where SE = sqrt of the summed reciprocals of the four cells.
log_or_se = np.sqrt(
    (1 / odds_ratio['rep'])
    + (1 / odds_ratio['dem'])
    + (1 / other_rep)
    + (1 / other_dem)
)
odds_ratio['log_or_ci_lower'] = odds_ratio['log_or'] - 1.96 * log_or_se
odds_ratio['log_or_ci_upper'] = odds_ratio['log_or'] + 1.96 * log_or_se

# NOTE: a narrative with a zero cell (e.g. used by only one party) yields
# inf/NaN values here rather than an error; no continuity correction is applied.
odds_ratio = odds_ratio[['narrative', 'or', 'log_or', 'log_or_ci_lower', 'log_or_ci_upper']]

# Attach the per-narrative statistics to every row of the main dataframe.
df = df.merge(odds_ratio, on='narrative', how='left')

# Add topic shares with labels, matched on `gpo_id`. A left join keeps every
# row of `df`; rows without a matching `gpo_id` get NaN in the added columns.
topics_by_gpo_id = pd.read_csv('../data/metadata/topics_labels_{0}.csv'.format(c))
df = df.merge(topics_by_gpo_id, on='gpo_id', how='left')

# Save dataframe (index=False keeps the row index out of the CSV).
df.to_csv('../data/gpo_final_data/narratives_complete_with_metadata_manual_labels_rich_{0}.csv'.format(c), index=False)
